Key Points

14,744 patients had dynamic data including heart rate, systolic blood pressure, diastolic blood pressure, pulse pressure, respiration rate and blood oxygen saturation for the first 4h. 253 out of them died within 24h after entering in ICU, after 48h the number increased to 515.

Notably, there is a clear separation between people who died in ICU vs the ones who did not, at the starting point


rm(list = ls())
library(data.table) # For fast loading and pre-processing of large datasets
packageVersion("data.table")
library(ggplot2)
library(MASS)
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
                      conf.interval=.95, .drop=TRUE) {
    library(plyr)

    # New version of length which can handle NA's: if na.rm==T, don't count them
    length2 <- function (x, na.rm=FALSE) {
        if (na.rm) sum(!is.na(x))
        else       length(x)
    }

    # This does the summary. For each group's data frame, return a vector with
    # N, mean, and sd
    datac <- ddply(data, groupvars, .drop=.drop,
      .fun = function(xx, col) {
        c(N    = length2(xx[[col]], na.rm=na.rm),
          mean = mean   (xx[[col]], na.rm=na.rm),
          sd   = sd     (xx[[col]], na.rm=na.rm)
        )
      },
      measurevar
    )

    # Rename the "mean" column    
    datac <- rename(datac, c("mean" = measurevar))

    datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean

    # Confidence interval multiplier for standard error
    # Calculate t-statistic for confidence interval: 
    # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
    ciMult <- qt(conf.interval/2 + .5, datac$N-1)
    datac$ci <- datac$se * ciMult

    return(datac)
}    
summarySEmedian <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
                      conf.interval=.95, .drop=TRUE) {
    library(plyr)

    # New version of length which can handle NA's: if na.rm==T, don't count them
    length2 <- function (x, na.rm=FALSE) {
        if (na.rm) sum(!is.na(x))
        else       length(x)
    }

    # This does the summary. For each group's data frame, return a vector with
    # N, mean, and sd
    datac <- ddply(data, groupvars, .drop=.drop,
      .fun = function(xx, col) {
        c(N    = length2(xx[[col]], na.rm=na.rm),
          median = median   (xx[[col]], na.rm=na.rm),
          sd   = sd     (xx[[col]], na.rm=na.rm)
        )
      },
      measurevar
    )

    # Rename the "mean" column    
    datac <- rename(datac, c("median" = measurevar))

    datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean

    # Confidence interval multiplier for standard error
    # Calculate t-statistic for confidence interval: 
    # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
    ciMult <- qt(conf.interval/2 + .5, datac$N-1)
    datac$ci <- datac$se * ciMult

    return(datac)
}
mergeDynChart <- function(rawDF, dynvariable, na.action = pass, na.rm=FALSE,
                           all=TRUE) {
  
  mergeDF <- merge(rawDF[, c("icustay_id", "subject_id", "itemid", "charttime", "valuenum", "valueuom")], PatientInfo,  by = c("icustay_id", "subject_id"), all = TRUE )
  
  mergeDF$time <- round(unclass(difftime(as.POSIXlt(mergeDF$charttime), as.POSIXlt(mergeDF$intime), units="hours")))
  
  mergeHours <- aggregate(as.numeric(valuenum) ~ icustay_id + subject_id + death_icu + time,  data = mergeDF, FUN = "mean", na.rm=FALSE, na.action = na.pass)

  names(mergeHours) <- c("icustay_id", "subject_id", "death_icu", "time", deparse(substitute(dynvariable)))
  
  mergeHours <- mergeHours[mergeHours$time >= 0,]
  mergeHours[5] <- round(mergeHours[5], 2)
  
  results <- list("mergeDF" = mergeDF, "mergeHours" = mergeHours)

  return(results)
  
}

mergeDynLab <- function(rawDF, dynvariable, na.action = pass, na.rm=FALSE,
                           all=TRUE) {
  
  mergeDF <- merge(rawDF[, c("icustay_id", "subject_id", "itemid", "charttime", "valuenum", "valueuom")], PatientInfo,  by = c("subject_id"), all = TRUE )
  
  mergeDF$time <- round(unclass(difftime(as.POSIXlt(mergeDF$charttime), as.POSIXlt(mergeDF$intime), units="hours")))
  
  mergeHours <- aggregate(as.numeric(valuenum) ~ icustay_id + subject_id + death_icu + time,  data = mergeDF, FUN = "mean", na.rm=FALSE, na.action = na.pass)

  names(mergeHours) <- c("icustay_id", "subject_id", "death_icu", "time", deparse(substitute(dynvariable)))
  
  mergeHours <- mergeHours[mergeHours$time >= 0,]
  mergeHours[5] <- round(mergeHours[5], 2)
  
  results <- list("mergeDF" = mergeDF, "mergeHours" = mergeHours)

  return(results)
  
}

StatisticalRange <- function(value, alpha=0.01){
  
  library(MASS)
  
  x <- value
  
  N <- length(value)
  aux <- boxcox( lm(x ~ 1) )
  lambda <- aux$x[which(aux$y == max(aux$y))]
  
  x1 <- (x^lambda - 1)/lambda # (with x1 <- log(x) if lambda = 0)
  
  lowertrans <- qnorm(alpha/N, mean = mean(x1), sd = sd(x1))
  uppertrans <- qnorm(1-alpha/N, mean = mean(x1), sd = sd(x1))
  
  lower <- (lowertrans*lambda + 1)^(1/lambda) 
  upper <- (uppertrans*lambda + 1)^(1/lambda) 
  
  results <- list( "lower" = round(lower, 2), "upper" = round(upper,2))
  
  return(results)
}

basicAnalysis <- function(mergeHours, dynvariable, variablefreetext="HR" ) {
  
  died1 <- mergeHours[mergeHours$death_icu == 1, ]
  died0 <- mergeHours[mergeHours$death_icu == 0, ]
  
  #summary(died1)
  #summary(died0)
  
  BasicBoxplot <- boxplot(died1[,5], died0[,5], 
        names = c("Died in ICU", "Did not die in ICU"),
        main = paste(variablefreetext, "stratified by death in ICU"))
  
  results <- list("died1" = died1, "died0" = died0, "BasicBoxplot" = BasicBoxplot)
  
  return(results)      
}

medianValuesPlot <- function(died1, died0, dynvariable, hours=72, na.rm = TRUE){
  
  vartext <- deparse(substitute(dynvariable))
  
  median1 <- summarySEmedian(died1, measurevar=vartext, groupvars=c("time"), na.rm = TRUE)
  median0 <- summarySEmedian(died0, measurevar=vartext, groupvars=c("time"), na.rm = TRUE)
  
  names(median1) <- c("time", "N", "var", "sd", "se", "ci")
  names(median0) <- c("time", "N", "var", "sd", "se", "ci")
  
  #All times
  plotAlltimes0 <- ggplot(median0, aes(x=time, y=var)) + 
                   geom_errorbar(aes(ymin=var-se, ymax=var+se), width=.1) +
                   ggtitle(paste(vartext, "(died = 0)"))
  
  plotAlltimes1 <- ggplot(median1, aes(x=time, y=var)) + 
                    geom_errorbar(aes(ymin=var-se, ymax=var+se), width=.1) +
                    ggtitle(paste(vartext, "(died = 1)"))

  plot <- ggplot() +
              geom_line(data = median1[median1$time <= hours, ], aes(x=time, y=var), color = "blue") +
              geom_line(data = median0[median0$time <= hours, ], aes(x=time, y=var)) +
              theme_minimal() + 
              theme(plot.title = element_text(size = 14, family = "Tahoma", face = "bold", hjust = 0.5),
                          text = element_text(size = 12, family = "Tahoma"),
                          axis.title = element_text(face="bold"),
                          axis.text.x=element_text(size = 11),
                          legend.position = "bottom") +
              ggtitle(paste(vartext,"after", hours, "hours h in ICU"))

  results <- list("median1" = median1, "median0" = median0,
                  "plotAlltimes0" = plotAlltimes0, "plotAlltimes1" = plotAlltimes0, "plotMedian" = plot)
  
  return(results)  
}


PlotMaxMin <- function(died1, died0, dynvariable, hours=360){
      
      names(died1) <- c("icustay_id","subject_id", "death_icu", "time", "var")
        
      names(died0) <- c("icustay_id","subject_id", "death_icu", "time", "var")
      
      max_1 <- aggregate(var ~ time,  data = died1, FUN = "max")
      min_1 <- aggregate(var ~ time,  data = died1, FUN = "min")
      max_0 <- aggregate(var ~ time,  data = died0, FUN = "max")
      min_0 <- aggregate(var ~ time,  data = died0, FUN = "min")
      
      
      plot1 <- ggplot() +
                geom_line(data = max_1[max_1$time < hours,], aes(x=time, y=var), color = "blue") +
                geom_line(data = min_1[min_1$time < hours,], aes(x=time, y=var), color = "blue") +
                geom_line(data = max_0[max_0$time < hours,], aes(x=time, y=var), color = "black") +
                geom_line(data = min_0[min_0$time < hours,], aes(x=time, y=var), color = "black") +
                theme_minimal() + 
                labs(title = paste("Maximum and minimum", dynvariable, "values in the first",hours ,"hours in ICU"), 
                     x="Time (h)")
      
      results <- list("plotMaxMin" = plot1 )
      
      return(results)
}

SpaguettiPlot <- function(died1, died0, variablefreetext, patients=50, hours=360){
  
    names(died1) <- c("icustay_id","subject_id", "death_icu", "time", "var")
    names(died0) <- c("icustay_id","subject_id", "death_icu", "time", "var")
    
    # random sample
    set.seed(150)
    
    indexes1 = sample(died1$icustay_id, size=patients)
    
    SpagPlot1 <-died1[died1$icustay_id %in%indexes1 & died1$time < hours,]
    
    set.seed(151)
    indexes0 = sample(died0$icustay_id, size=patients)
    
    SpagPlot0 <-died0[died0$icustay_id %in%indexes0 & died0$time < hours,]
    
    interaction.plot(SpagPlot1$time, SpagPlot1$icustay_id, SpagPlot1$var,
                     xlab = "Time", 
                     main=(paste(variablefreetext, "in", patients, "patients who died in ICU")),
                     legend = F)
    
    interaction.plot(SpagPlot0$time, SpagPlot0$icustay_id, SpagPlot0$var,
                     xlab = "Time", 
                     main=(paste(variablefreetext, "in", patients, "patients who did not die in ICU")) ,
                     legend = F)
}

BoxPlot4h <- function(died0, died1, hours, dynvariable, ylab=200){
  
  names(died1) <- c("icustay_id","subject_id", "death_icu", "time", "var")
  names(died0) <- c("icustay_id","subject_id", "death_icu", "time", "var")
  names(hours) <- c("icustay_id","subject_id","death_icu","time","var")
  
  died00 <- length(unique(died0[died0$time ==0,]$icustay_id))
  died01 <- length(unique(died0[died0$time ==1,]$icustay_id))
  died02 <- length(unique(died0[died0$time ==2,]$icustay_id))
  died03 <- length(unique(died0[died0$time ==3,]$icustay_id))
  died04 <- length(unique(died0[died0$time ==4,]$icustay_id))

  died10 <- length(unique(died1[died1$time ==0,]$icustay_id))
  died11 <- length(unique(died1[died1$time ==1,]$icustay_id))
  died12 <- length(unique(died1[died1$time ==2,]$icustay_id))
  died13 <- length(unique(died1[died1$time ==3,]$icustay_id))
  died14 <- length(unique(died1[died1$time ==4,]$icustay_id))
  
  BoxPlot <- ggplot() +
              geom_boxplot(data = hours[hours$time==0,], aes(x=time, y=var, fill = as.factor(death_icu)), alpha=0.7) +
              geom_boxplot(data = hours[hours$time==1,], aes(x=time, y=var, fill = as.factor(death_icu)), alpha=0.7) +
              geom_boxplot(data = hours[hours$time==2,], aes(x=time, y=var, fill = as.factor(death_icu)), alpha=0.7) +
              geom_boxplot(data = hours[hours$time==3,], aes(x=time, y=var, fill = as.factor(death_icu)), alpha=0.7) +
              geom_boxplot(data = hours[hours$time==4,], aes(x=time, y=var, fill = as.factor(death_icu)), alpha=0.7) +
              theme_minimal() + 
              theme(plot.title = element_text(size = 14, family = "Tahoma", face = "bold", hjust = 0.5),
                          text = element_text(size = 12, family = "Tahoma"),
                          axis.title = element_text(face="bold"),
                          axis.text.x=element_text(size = 11),
                          legend.position = "bottom") +
              ggtitle(paste("Boxplot of", dynvariable, " in the first 4 hours")) +
              scale_x_discrete(name ="Time (hours)", limits = c("1", "2", "3", "4")) +
              annotate("text", x=-0.25, y=ylab, label= paste("n=", died00), size = 3.2) +
              annotate("text", x=0.25, y=ylab, label= paste("n=", died10), size = 3.2) +
              annotate("text", x=0.75, y=ylab, label= paste("n=", died01), size = 3.2) +
              annotate("text", x=1.25, y=ylab, label= paste("n=", died11), size = 3.2) +
              annotate("text", x=1.75, y=ylab, label= paste("n=", died02),size = 3.2) +
              annotate("text", x=2.25, y=ylab, label= paste("n=", died12), size = 3.2) +
              annotate("text", x=2.75, y=ylab, label= paste("n=", died03), size = 3.2) +
              annotate("text", x=3.25, y=ylab, label= paste("n=", died13), size = 3.2) +
              annotate("text", x=3.75, y=ylab, label= paste("n=", died04), size = 3.2) +
              annotate("text", x=4.25, y=ylab, label= paste("n=", died14), size = 3.2)
  
  results <- list("BoxPlot" = BoxPlot)
  
  return(results)
  
}

NAimputation <- function(Df, col){
  
  for (i in unique(Df$icustay_id)){
    
    if (is.na(Df[Df$icustay_id == i & Df$time == 0,][col])){
      Df[Df$icustay_id == i & Df$time == 0,][col] <- Df[Df$icustay_id == i & Df$time == 1,][col]
    }
  
    if (is.na(Df[Df$icustay_id == i & Df$time == 3,][col])){
      Df[Df$icustay_id == i & Df$time == 3,][col] <- Df[Df$icustay_id == i & Df$time == 2,][col]
    }
  
    if (is.na(Df[Df$icustay_id == i & Df$time == 1,][col])){
      
      average_value1 <- (Df[Df$icustay_id == i & Df$time == 0,][col] + Df[Df$icustay_id == i & Df$time == 2,][col])/2
      
      Df[Df$icustay_id == i & Df$time == 1,][col] <- average_value1
    }
    
    if (is.na(Df[Df$icustay_id == i & Df$time == 2,][col])){
      
      average_value2 <- (Df[Df$icustay_id == i & Df$time == 1,][col] + Df[Df$icustay_id == i & Df$time == 3,][col])/2
      
      Df[raw_4H2$icustay_id == i & Df$time == 2,][col] <- average_value2
    }
    
  }
  
  return(Df)
}

1. Data extraction

Defining the dynamic variables

  1. The datasets were downloaded from physionet.org as compressed files, and were decompressed before building the database (after decompression: 46.58 GB).

  2. The MIMICIII database was built following this tutorial, which creates first the tables and then add the data.

  3. To get familiarise with the database and SQL query language, I completed the tutorial in the MIMICIII official website.

For example, this query counts the number of patients who died in the hospital:

SELECT expire_flag, COUNT(*)
FROM patients
GROUP BY expire_flag;
  1. Based on the previous publication by Calvert et al, 2016, we were interested in extracting the data for the following eight common clinical variables: heart rate, pH, pulse pressure, respiration rate, blood oxygen saturation, systolic blood pressure, temperature and white blood cell count. These variables are coded using a ITEMID, which is an identifier for a single measurement type in the database. Each row associated with one ITEMID (e.g. 212) corresponds to an instantiation of the same measurement (e.g. heart rate).

There are some important considerations:

  • D_ITEMS is sourced from two distinct ICU databases. The main consequence is that there are duplicate ITEMID for each concept. For example, heart rate is captured both as an ITEMID of 212 (CareVue) and as an ITEMID of 220045 (Metavision). As a result, it is necessary to search for multiple ITEMID to capture a single concept across the entire database. All Metavision ITEMIDs will have a value > 220000.

  • Another source of duplicate ITEMID is due to the free text nature of data entry in CareVue - as a result there are additional ITEMID which correspond to misspellings or synonymous descriptions of a single concept. It is important to search for all possible abbreviations and descriptions of a concept to capture all associated ITEMID.

The ITEMID for the variables of interest were manually look up in the database:

  • Heart rate.
SELECT *
FROM d_items
WHERE label LIKE "heart rate";

The query resulted in 2 rows: 211 and 220045, as indicated above. It is measured in bpm (just for the metavision records).

  • pH.
SELECT *
FROM d_items
WHERE label LIKE "%ph%"
ORDER BY label;

The query resulted in 433 rows, and the following were manually selected: pH (1673, 7459) pH arterial(4753, 223830, 780, 1126), pH (cap?) (4202), pH dipstick (220734), pH other (3839), pH SOFT (228243), pH venous (220274, 860), pH level (6003). In case we also want to include urine pH (6754, 1352, 1495, 1880, 17262).

  • Pulse pressure. It is the difference between the systolic and diastolic blood pressure. It is measured in millimeters of mercury (mmHg). Therefore, the diastolic pressure in addition to the systolic pressure needs to be extracted.

  • Respiration rate.

SELECT *
FROM d_items
WHERE label LIKE "%respiratory%"
ORDER BY label;
SELECT *
FROM d_items
WHERE label LIKE "%respiration%"
ORDER BY label;

The query results in 18 rows, and the following are manually selected: repiratory rate coded as 618 , 220210 and 224690 (Respiratory Rate (Total)), 228234 (PAR-Respiration). Note there are additional items such as respiratory rate set, when is controlled by the respiratory centre (224688, 619) or respiratory rate spontaneous (224689).

  • Blood oxygen saturation.
SELECT *
FROM d_items
WHERE label LIKE "%oxygen%"
ORDER BY label;

The query results in 7 rows and PAR-Oxygen saturation is manually selected (228232).

SELECT *
FROM d_items
WHERE label LIKE "%saturation%"
ORDER BY label;

The query results in 10 rows and the manual selection includes: PAR-Oxygen Saturation (228232), systemic vascular resistance(SVR) %O2 Saturation (226865), right atrial(RA) %O2 Saturation (226860), peripherial vascular resistance (PVR) %O2 Saturation (226863), Post Anesthetic Recovery (PAR) %O2 Saturation (228232), arterial partial pressure (PA) %O2 Saturation (226862), pulse oxeginity saturation (220277), arterial O2 Saturation (220227) and ART %O2 saturation (226861).

  • Systolic blood pressure.
SELECT *
FROM d_items
WHERE label LIKE "%systolic%"
ORDER BY label;

The query returns 27 rows, from which arterial blood pressure systolic is selected: 225309, 220050, 6071, 51. Also, manual measurements: manual blood pressure systolic left (224167) and manual blood pressure systolic right (227243).

  • Temperature.
SELECT *
FROM d_items
WHERE label LIKE "%temperature%"
ORDER BY label;

The query results in 16 rows and the manual selection included: temperature In F (679,678, 223761) and Celsius (677, 676,223762), and score based on ApacheIV (227054).

  • White blood cell count.
SELECT *
FROM d_items
WHERE label LIKE "%white%"
ORDER BY label;

The query returns only one row: WhiteBloodC 4.0-11.0 (3834). These are in LABEVENTS.

Johnson et al (2017) made available online the code used to run their analysis. It includes information about ITEMID for heart rate, systolic blood pressure, temperature, white blood cell count and respiration rate (this last one only for the carevue records). Therefore, heart rate, systolic blood pressure, temperature and white blood cell count records are extracted here with the same ITEMID. As for respiration rate, we also included metavision values.

Extracting the dynamic data for the cohort of study

Extracting the dynamic data for our cohort. This include the 31,545 patients as defined in DescMIMICIII.Rmd.

The static data is stored as StaticCohort.csv, in which patients are identified with the icustay_id. This column is extracted separated by commas so it can be used to query in MySQL. In bash:

tail -n+2 StaticCohort.csv | cut -f1 | tr "\n" "," > StaticCohortID.txt

In the next steps, #list of icustay_id are the 31,545 icustay_id stored in StaticCohortID.txt.

  • Heart rate:
SELECT subject_id, hadm_id, icustay_id, itemid, charttime, storetime, valuenum, valueuom
FROM chartevents
WHERE icustay_id IN (#list of icustay_id)
AND itemid IN 
  (
      211 -- Heart Rate
    , 220045 -- Heart Rate
    );

Results saved as StaticCohortHR.csv. To double-check if the extract was done okay, the number of patients are extracted in bash (should be 25,145), and the first 6 lines shown:

cat StaticCohortHR.csv | cut -d"," -f1 | sort -u | wc -l
head StaticCohortHR.csv

Then it is compressed as follows:

cat StaticCohortHR.csv | gzip > StaticCohortHR.csv.gz
  • Systolic Blood Pressure
SELECT subject_id, hadm_id, icustay_id, itemid, charttime, storetime, valuenum, valueuom
FROM chartevents
WHERE icustay_id IN (#list of icustay_id)
AND itemid IN
  (
      6 -- ABP [Systolic]
    , 51 -- Arterial BP [Systolic]
    , 455 -- NBP [Systolic]
    , 6701 -- Arterial BP #2 [Systolic]
    , 220050 -- Arterial Blood Pressure systolic
    , 220179 -- Non Invasive Blood Pressure systolic
  );

Results saved as StaticCohortSBP.csv.

  • Temperature
SELECT subject_id, hadm_id, icustay_id, itemid, charttime, storetime, valuenum, valueuom
FROM chartevents
WHERE icustay_id IN (#list of icustay_id)
AND itemid IN
  (
      676 -- Temperature C
    , 677 -- Temperature C (calc)
    , 678 -- Temperature F
    , 679 -- Temperature F (calc)
    , 223761 -- Temperature Fahrenheit
    , 223762 -- Temperature Celsius
  );

Results saved as StaticCohortTEMP.csv.

  • White blood cell count. This is given in the LABEVENTS chart, which uses SUBJECT_ID instead of ICUSTAY_ID to identify the patients. Therefore, first the SUBJECT_IDs for our cohort are extracted based on the ICUSTAY_IDs, and used to get the white blood cell count measuremet.
SELECT subject_id, icustay_id
FROM icustays
WHERE icustay_id IN (#list of icustay_id);

The query is exported as StaticCohortSUBJECTID.csv. Then, the SUBJECTID in bash:

tail -n+2 StaticCohortPATIENTID.csv | cut -d "," -f1 | tr "\n" "," > StaticCohortSUBJECTID.txt

In the next steps, #list of subject_id are the 31,545 subject_id stored in StaticCohortSUBJECTID.txt.

SELECT subject_id, itemid, charttime, valuenum, valueuom
FROM labevents
WHERE subject_id IN (#list of subject_id)
AND itemid IN 
 (
      51300 -- WBC
    , 51301 -- White Blood Cells
  );

Results saved as StaticCohortWBC.csv.

SELECT subject_id, hadm_id, icustay_id, itemid, charttime, storetime, valuenum, valueuom
FROM icustays
WHERE icustay_id IN (#list of icustay_id);
AND itemid IN 
 (
      618    --  Respiratory Rate
    , 615    --  Resp Rate (Total)
    , 220210 --  Respiratory Rate
    , 224690 --  Respiratory Rate (Total)
  )

Results saved as StaticCohortRR.csv. Then zip bash cat StaticCohortHR.csv | gzip > StaticCohortHR.csv.gz

  • pH. Excluded for now. It was not extracted. The ITEMID was manually selected. The query resulted in 433 rows, and the following were manually selected: pH (1673, 7459) pH arterial(4753, 223830, 780, 1126), pH (cap?) (4202), pH dipstick (220734), pH other (3839), pH SOFT (228243), pH venous (220274, 860), pH level (6003). In case we also want to include urine pH (6754, 1352, 1495, 1880, 17262).
SELECT subject_id, hadm_id, icustay_id, itemid, charttime, storetime, valuenum, valueuom
FROM labevents
WHERE icustay_id IN (#list of subject_id)
AND itemid IN 
 (
      1673    --  PH
    , 7459    --  Ph
    , 4753    -- pH (Art)
    , 223830  -- PH (Arterial) 
    , 780     --  Arterial pH
    , 1126    -- Art.pH
    , 220274  -- PH (Venous)
    , 860     -- Venous pH
  );

Results saved as StaticCohortPH.csv.

  • Blood oxygen saturation. In addition to SpO2 (as done by Johnson et al), SaO2 was also extracted.
SELECT subject_id, hadm_id, icustay_id, itemid, charttime, storetime, valuenum, valueuom
FROM icustays
WHERE icustay_id IN (#list of icustay_id)
AND itemid IN 
 (
      646    --  SpO2
    , 220277 --  SpO2
    , 834    --  SaO2 
    , 220227 --  Arterial O2 Saturation (SaO2)
  );

Results saved as StaticCohortO2S.csv.

  • Pulse pressure. It is the difference between the systolic and diastolic blood pressure. It is measured in millimeters of mercury (mmHg). Therefore, the diastolic pressure in addition to the systolic pressure needs to be extracted. The ITEMID are chosen as defined by Johnson et al (2016).
SELECT subject_id, hadm_id, icustay_id, itemid, charttime, storetime, valuenum, valueuom
FROM chartevents
WHERE icustay_id IN (#list of icustay_id)
AND itemid IN 
 (
      8368   --  Arterial BP [Diastolic]
    , 8440   --  Manual BP [Diastolic]
    , 8441   --  NBP [Diastolic]
    , 8555 --  Arterial BP #2 [Diastolic]
    , 220180 --  Non Invasive Blood Pressure diastolic
    , 220051   --  Arterial Blood Pressure diastolic
  );

Results saved as StaticCohortDBP.csv.

Admission and discharged times

To extract the outtime information (it should match the death time for patients who died in ICU, )

SELECT subject_id, icustay_id, intime, outtime 
FROM icustays
WHERE icustay_id IN (#list of icustay_id);

The results are exported as StaticCohortTIME.csv

In bash, we extracted the death status for the icustay_id

cat StaticCohort.csv | cut -f1,4 > DeathStatusSC.csv

2. Building a single dataset for dynamic data

# Reading the expression counts
olddir <- getwd()
setwd(data.path)

# Load IcustayID, SubjectID, INTIME
PatientTIME <- fread(input = 'cat < /Users/maria/Desktop/MIMIC-III/mappingID/StaticCohortTIME.csv ')
# Load Death Status in ICU ( in bash: cat StaticCohort.csv | cut -f1,4 > DeathStatusSC.csv)
DeathICU <- fread(input = 'cat < /Users/maria/Desktop/MIMIC-III/mappingID/DeathStatusSC.csv ')

#Merge intime with death status
PatientInfo <- merge(DeathICU, PatientTIME,  by = c("icustay_id"))

# Load dynamic data
DfHR <- fread(input = 'zcat < StaticCohortHR.csv.gz')
DfDBP <- fread(input = 'zcat < StaticCohortDBP.csv.gz')
DfO2S <- fread(input = 'zcat < StaticCohortO2S.csv.gz')
# DfPH <- fread(input = 'zcat < StaticCohortPH.csv.gz')
DfRR <- fread(input = 'zcat < StaticCohortRR.csv.gz')
DfSBP <- fread(input = 'zcat < StaticCohortSBP.csv.gz')
DfTEMP <- fread(input = 'zcat < StaticCohortTEMP.csv.gz')
DfWBC <- fread(input = 'zcat < StaticCohortWBC.csv.gz')

#DfTEMP <- fread(input = 'zcat < Users/maria/Desktop/MIMIC-III/dynamic_dat/StaticCohortTEMP.csv.gz')

# We can extract death from here
setwd(olddir)

We were interested in extracting the data for the following eight common clinical variables: heart rate, pH, pulse pressure, respiration rate, blood oxygen saturation, systolic blood pressure, temperature and white blood cell count.

  • Heart rate: 25,145
  • Diastolic blood pressure: 25,142
  • Blood oxygen saturation: 25,137
  • Respiration rate: 25,133
  • Systolic blood pressure: 25,142
  • Temperature: 25,087
  • White blood cell count: 25,094
length(unique(DfHR$icustay_id))
length(unique(DfDBP$icustay_id))
length(unique(DfO2S$icustay_id))
#length(unique(DfPH$icustay_id))
length(unique(DfRR$icustay_id))
length(unique(DfSBP$icustay_id))
length(unique(DfTEMP$icustay_id))

length(unique(DfWBC$subject_id))
# It has text
# unique(DfHR$value)
# all Numeric (or null)
# unique(DfDBP$value)
# unique(DfSBP$value)
# unique(DfRR$value)
# unique(DfTEMP$value)
# unique(DfO2S$value)
# unique(DfWBC$value)

The temperature could be recorded as Celsius or Fahrenheit, therefore, all the measurements were transformed to Celsius.

DfHR$valuenum <- as.numeric(DfHR$valuenum)
## Warning: NAs introduced by coercion
DfSBP$valuenum <- as.numeric(DfSBP$valuenum)
## Warning: NAs introduced by coercion
DfDBP$valuenum <- as.numeric(DfDBP$valuenum)
DfRR$valuenum <- as.numeric(DfRR$valuenum)
## Warning: NAs introduced by coercion
DfTEMP$valuenum <- as.numeric(DfTEMP$valuenum)
## Warning: NAs introduced by coercion
DfO2S$valuenum <- as.numeric(DfO2S$valuenum)
## Warning: NAs introduced by coercion
DfWBC$valuenum <- as.numeric(DfWBC$valuenum)
## Warning: NAs introduced by coercion
# Create a new column in DfTEMP (temp) so all the records are in Celsius.

DfTEMP$temp = DfTEMP$valuenum

DfTEMP[, temp := round(((DfTEMP$valuenum-32) * (5/9)), 1)]

DfTEMP[itemid == 676 |itemid == 677 |itemid == 223762 , temp := round(DfTEMP$valuenum, 1)]
## Warning in `[.data.table`(DfTEMP, itemid == 676 | itemid == 677 | itemid
## == : Supplied 1422996 items to be assigned to 626360 items of column
## 'temp' (796636 unused)
DfTEMP$tempC[DfTEMP$itemid %in% c(676, 677, 223762)] <- DfTEMP$valuenum[DfTEMP$itemid %in% c(676, 677, 223762)]
DfTEMP$tempC[DfTEMP$itemid %in% c(678, 679, 223761)] <- DfTEMP$temp[DfTEMP$itemid %in% c(678, 679, 223761)]
# length(unique(DfHR$icustay_id))
# length(unique(DfDBP$icustay_id))
# length(unique(DfO2S$icustay_id))
# #length(unique(DfPH$icustay_id))
# length(unique(DfRR$icustay_id))
# length(unique(DfSBP$icustay_id))
# length(unique(DfTEMP$icustay_id))
# length(unique(DfWBC$subject_id))

Quality control

Firstly, I check that all the records were measured in the same unit for each dynamic variable.

unique(DfHR$valueuom)
unique(DfDBP$valueuom)
unique(DfSBP$valueuom)
unique(DfRR$valueuom)
unique(DfTEMP$valueuom)
unique(DfO2S$valueuom)
unique(DfWBC$valueuom)
  • Heart rate: “bpm” “BPM”
  • DBP: mmHg
  • SBP: mmHg, NULL
  • Respiration rate: insp/min, BPM (breaths/min) and NULL
  • Temperature: ?F, ?C, Deg. C, Deg .F and NULL
  • Blood oxygen saturation: % and NULL
  • White blood cell counts: K/ul and NULL

Double check the NULL values:

DfSBP[valueuom == "NULL"]
DfRR[valueuom == "NULL"]
DfTEMP[valueuom == "NULL"]
DfO2S[valueuom == "NULL"]
DfWBC[valueuom == "NULL"]
  • SBP: NULL units correspond to NULL value (5,729).
  • RR: NULL units correspond to NULL value (2,387).
  • TEMP: NULL units correspond to NULL value (5,317).
  • O2S: NULL units correspond to NULL value (332).
  • WBC: NULL units correspond to numeric values (18).

3. Defining the ranges limits for each dynamic variable: detection of outliers

Several approaches were tested to define reasonable boundaries for each of the dynamic categories:

Domain knowledge Medical guidelines Johnson et al (Github) Statistical approach
HR 200/40 252/0 300/0 257/22
SBP 250/50 502/0 400/0 341/33
DBP 160/0 671/0 300/0 308/5
RR 40/10 4084/0 70/0 91/2
TEMP 50/24 50/27
SpO2 100/40 11751/0 100/0 132/70
WBC 30/0

The number of WBC records was very low and therefore, this variable was excluded in the analysis at this point.

# just once for HR 
library(MASS)
alpha = 0.01
x <- DfHR[DfHR$valuenum > 0,]$valuenum

N <- length(x)

aux <- boxcox( lm(x~ 1) )

lambda <- aux$x[which(aux$y == max(aux$y))]
x1 <- (x^lambda - 1)/lambda # (with x1 <- log(x) if lambda = 0)
lowertrans <- qnorm(alpha/N, mean = mean(x1), sd = sd(x1))
uppertrans <- qnorm(1-alpha/N, mean = mean(x1), sd = sd(x1))

lower <- (lowertrans*lambda + 1)^(1/lambda) 
upper <- (uppertrans*lambda + 1)^(1/lambda) 
HR_StatRange <- StatisticalRange(DfHR[DfHR$valuenum > 0,]$valuenum)
SBP_StatRange <- StatisticalRange(DfSBP[DfSBP$valuenum > 0,]$valuenum)

DBP_StatRange <- StatisticalRange(DfDBP[DfDBP$valuenum >= 0,]$valuenum)
O2S_StatRange <- StatisticalRange(DfO2S[DfO2S$valuenum > 0,]$valuenum)
TEMP_StatRange <- StatisticalRange(DfTEMP[DfTEMP$tempC > 0,]$tempC)
WBC_StatRange <- StatisticalRange(DfWBC[DfWBC$valuenum > 0,]$valuenum)


par(mfrow=c(4,2))
hist(DfHR$valuenum, xlim = c(0, 400), breaks = 10000, main= "Distribution HR values")
abline(v = HR_StatRange$lower, col = "red")
abline(v = HR_StatRange$upper, col = "red")
hist(DfSBP$valuenum, xlim = c(0, 400), breaks = 10000, main= "Distribution SBP values")
abline(v = SBP_StatRange$lower, col = "red")
abline(v = SBP_StatRange$upper, col = "red")
hist(DfDBP$valuenum, xlim = c(0, 400), breaks = 10000, main= "Distribution DBP values")
abline(v = DBP_StatRange$lower, col = "red")
abline(v = DBP_StatRange$upper, col = "red")
RR_StatRange <- StatisticalRange(DfRR[DfRR$valuenum > 0,]$valuenum)
hist(DfRR$valuenum, xlim = c(0, 10000), breaks = 1000, main= "Distribution RR values")
abline(v = RR_StatRange$lower, col = "red")
abline(v = RR_StatRange$upper, col = "red")
hist(DfO2S$valuenum, ylim = c(0, 3000000), xlim = c(0, 1000), breaks = 10000, main= "Distribution O2S values")
abline(v = O2S_StatRange$lower, col = "red")
abline(v = O2S_StatRange$upper, col = "red")
hist(DfTEMP$temp, xlim = c(0, 1000), breaks = 10000, main= "Distribution TEMP values")
abline(v = TEMP_StatRange$lower, col = "red")
abline(v = TEMP_StatRange$upper, col = "red")
hist(DfWBC$valuenum, xlim = c(0, 400), breaks = 10000, main= "Distribution WBC values")
abline(v = WBC_StatRange$lower, col = "red")
abline(v = WBC_StatRange$upper, col = "red")

HR_adj <- DfHR[DfHR$valuenum >= 22 & DfHR$valuenum <= 257, ]
SBP_adj <- DfSBP[DfSBP$valuenum >= 33 & DfSBP$valuenum <= 341 , ]
DBP_adj <- DfDBP[DfDBP$valuenum >= 5 & DfDBP$valuenum <= 308, ]
RR_adj <- DfRR[DfRR$valuenum >= 2 & DfRR$valuenum <= 91, ]
TEMP_adj <- DfTEMP[DfTEMP$tempC >= 27 & DfTEMP$tempC <= 50, ]
O2S_adj <- DfO2S[DfO2S$valuenum >= 70 & DfO2S$valuenum <= 132, ]
#WBC_adj <- DfWBC[DfWBC$valuenum > 0 & DfWBC$valuenum < 30, ]


# length(unique(HR_adj$subject_id))
# length(unique(SBP_adj$subject_id))
# length(unique(DBP_adj$subject_id))
# length(unique(RR_adj$subject_id))
# length(unique(TEMP_adj$subject_id))
# length(unique(O2S_adj$subject_id))
# length(unique(WBC_adj$subject_id))

After excluding outliers:

  • Heart rate: 25,145
  • Systolic blood pressure: 25,142
  • Diastolic blood pressure: 25,142
  • Blood oxygen saturation: 25,134
  • Respiration rate: 25,133
  • Temperature: 25,087

4. Merging the datasets

  • SBP.
# SBP
SBP_tables <- mergeDynChart(SBP_adj, sbp)
SBP_merge <- SBP_tables$mergeDF
SBP_hours <- SBP_tables$mergeHours

# DBP
DBP_tables <- mergeDynChart(DBP_adj, dbp)
DBP_merge <- DBP_tables$mergeDF
DBP_hours <- DBP_tables$mergeHours

# HR
HR_tables <- mergeDynChart(HR_adj, hr)
HR_merge <- HR_tables$mergeDF
HR_hours <- HR_tables$mergeHours

# O2S
O2S_tables <- mergeDynChart(O2S_adj, o2s)
O2S_merge <- O2S_tables$mergeDF
O2S_hours <- O2S_tables$mergeHours

# RR
RR_tables <- mergeDynChart(RR_adj, rr)
RR_merge <- RR_tables$mergeDF
RR_hours <- RR_tables$mergeHours

# Special case Temperature

TEMP_merge <- merge(TEMP_adj[, c("icustay_id", "subject_id", "itemid", "charttime", "tempC")], PatientInfo,  by = c("icustay_id", "subject_id"), all = TRUE )
  
TEMP_merge$time <- round(unclass(difftime(as.POSIXlt(TEMP_merge$charttime), as.POSIXlt(TEMP_merge$intime), units="hours")))
  
TEMP_hours <- aggregate(as.numeric(tempC) ~ icustay_id + subject_id + death_icu + time,  data = TEMP_merge, FUN = "mean", na.rm=FALSE, na.action = na.pass)

names(TEMP_hours) <- c("icustay_id", "subject_id", "death_icu", "time", "tempC")
  
TEMP_hours <- TEMP_hours[TEMP_hours$time >= 0,]
TEMP_hours[5] <- round(TEMP_hours[5], 2)
  
# Special case: WBC
#WBC_tables <- mergeDynLab(WBC_adj, wbc)
#WBC_merge <- WBC_tables$mergeDF
#WBC_hours <- WBC_tables$mergeHours
## SBP 
# It should return 25,145
# length(unique(SBP_merge$subject_id))
# length(unique(SBP_merge$icustay_id))
# It should return 25,142
# length(unique(SBP_hours$subject_id))
# length(unique(SBP_hours$icustay_id))
# setdiff(sort(unique(SBP_merge$subject_id)), sort(unique(SBP_hours$subject_id)))
# setdiff(sort(unique(SBP_merge$icustay_id)), sort(unique(SBP_hours$icustay_id)))
# SBP_merge[subject_id %in% c(41874,64590, 73583 ),]

## DBP
# It should return 25,145
# length(unique(DBP_merge$subject_id))
# length(unique(DBP_merge$icustay_id))
# It should return 25,142
# length(unique(DBP_hours$subject_id))
# length(unique(DBP_hours$icustay_id))
# setdiff(sort(unique(DBP_merge$subject_id)), sort(unique(DBP_hours$subject_id)))
# setdiff(sort(unique(DBP_merge$icustay_id)), sort(unique(DBP_hours$icustay_id)))
# DBP_merge[subject_id %in% c(41874,64590, 73583 ),]

## RR
# It should return 25,145
# length(unique(RR_merge$subject_id))
# length(unique(RR_merge$icustay_id))
# It should return 25,131
# length(unique(RR_hours$subject_id))
# length(unique(RR_hours$icustay_id))
# setdiff(sort(unique(RR_merge$subject_id)), sort(unique(RR_hours$subject_id)))
# setdiff(sort(unique(RR_merge$icustay_id)), sort(unique(RR_hours$icustay_id)))
# RR_merge[subject_id %in% c( 1628,6306,6314,9986,12840,13673,15302,15944,16980,20099,21542,24956,44611,97922 ),]

## HR
# It should return 25145
# length(unique(HR_merge$subject_id))
# length(unique(HR_merge$icustay_id))
# It should return 25145
# length(unique(HR_hours$subject_id))
# length(unique(HR_hours$icustay_id))

## O2S
# It should return 25,145
# length(unique(O2S_merge$subject_id))
# length(unique(O2S_merge$icustay_id))
# It should return 25,133
# length(unique(O2S_hours$subject_id))
# length(unique(O2S_hours$icustay_id))
# setdiff(sort(unique(O2S_merge$subject_id)), sort(unique(O2S_hours$subject_id)))
# setdiff(sort(unique(O2S_merge$icustay_id)), sort(unique(O2S_hours$icustay_id)))
# O2S_merge[subject_id %in% c( 1628,4607, 16007, 24256, 49395, 67813, 68439, 70912, 77589, 87174),]


## TEMP
# It should return 25,145
# length(unique(TEMP_merge$subject_id))
# length(unique(TEMP_merge$icustay_id))
# It should return 25,078
# length(unique(TEMP_hours$subject_id))
# length(unique(TEMP_hours$icustay_id))

## WBC
# It should return 25,145 patients
# length(unique(WBC_merge$subject_id))
# length(unique(WBC_merge$icustay_id))
# It should return 24,734
# length(unique(WBC_hours$subject_id))
# length(unique(WBC_hours$icustay_id))
# setdiff(sort(unique(WBC_merge$subject_id)), sort(unique(WBC_hours$subject_id)))
# setdiff(sort(unique(WBC_merge$icustay_id)), sort(unique(WBC_hours$icustay_id)))
# WBC_merge[subject_id %in% setdiff(sort(unique(WBC_merge$subject_id)), sort(unique(WBC_hours$subject_id)))[1:100],]

4. Exploratory analysis of the dynamic variables

Heart Rate

BasicHR <- basicAnalysis(HR_hours, hr, "Heart rate")

MedianHR360 <- medianValuesPlot(died1=BasicHR$died1, died0=BasicHR$died0, dynvariable=hr, hours=360)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced

## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
PlotMaxMinHR360 <- PlotMaxMin(died1=BasicHR$died1, died0=BasicHR$died0, dynvariable="Heart rate", hours=360)
MedianHR720 <- medianValuesPlot(died1=BasicHR$died1, died0=BasicHR$died0, dynvariable=hr, hours=720)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced

## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
PlotMaxMinHR720 <- PlotMaxMin(died1=BasicHR$died1, died0=BasicHR$died0, dynvariable="Heart rate", hours=720)

par(mfrow=c(2,2))
MedianHR360$plotMedian

MedianHR720$plotMedian

PlotMaxMinHR360$plotMaxMin

PlotMaxMinHR720$plotMaxMin

SpaguettiPlot360HR <- SpaguettiPlot(died1=BasicHR$died1, died0=BasicHR$died0, "Heart Rate", patients=50, hours=360)

par(mfrow=c(1,1))

BoxPlot4hHR <- BoxPlot4h(died0=BasicHR$died0, died1=BasicHR$died1, HR_hours, "HR", ylab=200)
BoxPlot4hHR$BoxPlot

Systolic Blood Pressure

BasicSBP <- basicAnalysis(SBP_hours, sbp, "Systolic Blood Pressure")

MedianSBP360 <- medianValuesPlot(died1=BasicSBP$died1, died0=BasicSBP$died0, dynvariable=sbp, hours=360)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced

## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
PlotMaxMinSBP360 <- PlotMaxMin(died1=BasicSBP$died1, died0=BasicSBP$died0, dynvariable="Systolic Blood Pressure", hours=360)
MedianSBP720 <- medianValuesPlot(died1=BasicSBP$died1, died0=BasicSBP$died0, dynvariable=sbp, hours=720)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced

## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
PlotMaxMinSBP720 <- PlotMaxMin(died1=BasicSBP$died1, died0=BasicSBP$died0, dynvariable="Systolic Blood Pressure", hours=720)

par(mfrow=c(2,2))
MedianSBP360$plotMedian

PlotMaxMinSBP360$plotMaxMin

MedianSBP720$plotMedian

PlotMaxMinSBP720$plotMaxMin

SpaguettiPlot360SBP <- SpaguettiPlot(died1=BasicSBP$died1, died0=BasicSBP$died0, "Systolic Blood Pressure", patients=50, hours=360)

par(mfrow=c(1,1))

BoxPlot4hSBP <- BoxPlot4h(died0=BasicSBP$died0, died1=BasicSBP$died1, SBP_hours, "Systolic Blood Pressure", ylab=300)
BoxPlot4hSBP$BoxPlot

Diastolic Blood Pressure

BasicDBP <- basicAnalysis(DBP_hours, sbp, "Diastolic Blood Pressure")

MedianDBP360 <- medianValuesPlot(died1=BasicDBP$died1, died0=BasicDBP$died0, dynvariable=dbp, hours=360)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced

## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
MedianDBP720 <- medianValuesPlot(died1=BasicDBP$died1, died0=BasicDBP$died0, dynvariable=dbp, hours=720)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced

## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
PlotMaxMinDBP360 <- PlotMaxMin(died1=BasicDBP$died1, died0=BasicDBP$died0, dynvariable="Diastolic Blood Pressure", hours=360)
PlotMaxMinDBP720 <- PlotMaxMin(died1=BasicDBP$died1, died0=BasicDBP$died0, dynvariable="Diastolic Blood Pressure", hours=720)

par(mfrow=c(2,2))
MedianDBP360$plotMedian

MedianDBP720$plotMedian

PlotMaxMinDBP360$plotMaxMin

PlotMaxMinDBP720$plotMaxMin

SpaguettiPlot360DBP <- SpaguettiPlot(died1=BasicDBP$died1, died0=BasicDBP$died0, "Diastolic Blood Pressure", patients=50, hours=360)

par(mfrow=c(1,1))

BoxPlot4hDBP <- BoxPlot4h(died0=BasicDBP$died0, died1=BasicDBP$died1, DBP_hours, "Diastolic Blood Pressure", ylab=250)
BoxPlot4hDBP$BoxPlot

Respiration Rate

BasicRR <- basicAnalysis(RR_hours, rr, "Respiration Rate")

MedianRR360 <- medianValuesPlot(died1=BasicRR$died1, died0=BasicRR$died0, dynvariable=rr, hours=360)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced

## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
MedianRR720 <- medianValuesPlot(died1=BasicRR$died1, died0=BasicRR$died0, dynvariable=rr, hours=720)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced

## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
PlotMaxMinRR360 <- PlotMaxMin(died1=BasicRR$died1, died0=BasicRR$died0, dynvariable="Respiration Rate", hours=360)
PlotMaxMinRR720 <- PlotMaxMin(died1=BasicRR$died1, died0=BasicRR$died0, dynvariable="Respiration Rate", hours=720)

par(mfrow=c(2,2))
MedianRR360$plotMedian

MedianRR720$plotMedian

PlotMaxMinRR360$plotMaxMin

PlotMaxMinRR720$plotMaxMin

par(mfrow=c(2,2))
SpaguettiPlot360RR <- SpaguettiPlot(died1=BasicDBP$died1, died0=BasicDBP$died0, "Respiration Rate", patients=50, hours=360)

par(mfrow=c(1,1))

BoxPlot4hRR <- BoxPlot4h(died0=BasicRR$died0, died1=BasicRR$died1, RR_hours, "Respiration Rate", ylab=100)
BoxPlot4hRR$BoxPlot

Blood Oxygen Saturation

BasicO2S <- basicAnalysis(O2S_hours, o2s, "Blood Oxygen Saturation")

MedianO2S360 <- medianValuesPlot(died1=BasicO2S$died1, died0=BasicO2S$died0, dynvariable=o2s, hours=360)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced

## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
MedianO2S720 <- medianValuesPlot(died1=BasicO2S$died1, died0=BasicO2S$died0, dynvariable=o2s, hours=720)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced

## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
PlotMaxMinO2S360 <- PlotMaxMin(died1=BasicO2S$died1, died0=BasicO2S$died0, dynvariable= "Blood Oxygen Saturation", hours=360)
PlotMaxMinO2S720 <- PlotMaxMin(died1=BasicO2S$died1, died0=BasicO2S$died0, dynvariable="Blood Oxygen Saturation", hours=720)

par(mfrow=c(2,2))
MedianO2S360$plotMedian

MedianO2S720$plotMedian

PlotMaxMinO2S360$plotMaxMin

PlotMaxMinO2S720$plotMaxMin

SpaguettiPlot360O2S <- SpaguettiPlot(died1=BasicO2S$died1, died0=BasicO2S$died0, "Blood Oxygen Saturation", patients=50, hours=360)

par(mfrow=c(1,1))

BoxPlot4hO2S <- BoxPlot4h(died0=BasicO2S$died0, died1=BasicO2S$died1, O2S_hours, "Blood Oxygen Saturation", ylab=110)
BoxPlot4hO2S$BoxPlot

Temperature

BasicTEMP <- basicAnalysis(TEMP_hours, tempC, "Temperature")

MedianTEMP360 <- medianValuesPlot(died1=BasicTEMP$died1, died0=BasicTEMP$died0, dynvariable=tempC, hours=360)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced

## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
MedianTEMP720 <- medianValuesPlot(died1=BasicTEMP$died1, died0=BasicTEMP$died0, dynvariable=tempC, hours=720)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced

## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
PlotMaxMinTEMP360 <- PlotMaxMin(died1=BasicTEMP$died1, died0=BasicTEMP$died0, dynvariable="Temperature", hours=360)
PlotMaxMinTEMP720 <- PlotMaxMin(died1=BasicTEMP$died1, died0=BasicTEMP$died0, dynvariable="Temperature", hours=720)

par(mfrow=c(2,2))
MedianTEMP360$plotMedian

MedianTEMP720$plotMedian

PlotMaxMinTEMP360$plotMaxMin

PlotMaxMinTEMP720$plotMaxMin

par(mfrow=c(2,2))
SpaguettiPlot360TEMP <- SpaguettiPlot(died1=BasicTEMP$died1, died0=BasicTEMP$died0, "Temperature", patients=50, hours=360)

par(mfrow=c(1,1))

BoxPlot4hTEMP <- BoxPlot4h(died0=BasicTEMP$died0, died1=BasicTEMP$died1, TEMP_hours, "Temperature", ylab=50)
BoxPlot4hTEMP$BoxPlot

Exploratory analysis with specific ITEMID: Systolic Arterial BP

Systolic Blood Pressure can be measured invasely or non-invasively. In this case, we will run the analysis only taken into account the measurements performed directly in the artery (itemid: 6, 51, 6701, 220050). 14,774 patients were recorded using this itemid.

SBP_arterial <- SBP_merge[SBP_merge$itemid %in% c(6, 51, 6701, 220050),]

SBPA_hours <- aggregate(as.numeric(valuenum) ~ icustay_id + subject_id + death_icu + time,  data = SBP_arterial, FUN = "mean", na.rm=FALSE, na.action = na.pass)

names(SBPA_hours) <- c("icustay_id", "subject_id", "death_icu", "time", "sbp")

SBPA_hours <- SBPA_hours[SBPA_hours$time >= 0, ]
SBPA_hours$sbp <- round(SBPA_hours$sbp, 2)

print(" First 6 rows")  
## [1] " First 6 rows"
head(SBPA_hours)
##      icustay_id subject_id death_icu time sbp
## 1220     263738         13         0    0 151
## 1221     262236         24         0    0 137
## 1222     295037         32         0    0 108
## 1223     274249         45         0    0 143
## 1224     249195         49         0    0 104
## 1225     297831         77         0    0 127
length(unique(SBPA_hours$icustay_id))
## [1] 14770
BasicSBPA <- basicAnalysis(SBPA_hours, sbp, "SBP Arterial")

MedianSBPA360 <- medianValuesPlot(died1=BasicSBPA$died1, died0=BasicSBPA$died0, dynvariable=sbp, hours=360)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced

## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
MedianSBPA720 <- medianValuesPlot(died1=BasicSBPA$died1, died0=BasicSBPA$died0, dynvariable=sbp, hours=720)
## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced

## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced
PlotMaxMinSBPA360 <- PlotMaxMin(died1=BasicSBPA$died1, died0=BasicSBPA$died0, dynvariable="SBP Arterial", hours=360)
PlotMaxMinSBPA720 <- PlotMaxMin(died1=BasicSBPA$died1, died0=BasicSBPA$died0, dynvariable="SBP Arterial", hours=720)

par(mfrow=c(2,2))
MedianSBPA360$plotMedian

MedianSBPA720$plotMedian

PlotMaxMinSBPA360$plotMaxMin

PlotMaxMinSBPA720$plotMaxMin

par(mfrow=c(2,2))
SpaguettiPlot360SBPA <- SpaguettiPlot(died1=BasicSBPA$died1, died0=BasicSBPA$died0, "SBP Arterial", patients=50, hours=360)

par(mfrow=c(1,1))

BoxPlot4hSBPA <- BoxPlot4h(died0=BasicSBPA$died0, died1=BasicSBPA$died1, SBPA_hours, "SBP Arterial", ylab=280)
BoxPlot4hSBPA$BoxPlot

5. Extracting the 4h cohort

#nrow(SBP_hours[SBP_hours$time < 4, ])
#68571
#nrow(HR_hours[HR_hours$time < 4, ])
#70053

HS_4h <- merge(HR_hours[HR_hours$time < 4, ], SBP_hours[SBP_hours$time < 4, ], by = c("icustay_id", "subject_id", "death_icu", "time"), all = TRUE)

#nrow(HS_4h)
#70299

HSD_4h <- merge(HS_4h, DBP_hours[DBP_hours$time < 4, ], by = c("icustay_id", "subject_id", "death_icu", "time"), all = TRUE)

# Create pulse pressure
HSD_4h$pp <- HSD_4h$sbp - HSD_4h$dbp
#nrow(HSD_4h)
#70299

HSDPR_4h <- merge(HSD_4h, RR_hours[RR_hours$time < 4, ], by = c("icustay_id", "subject_id", "death_icu", "time"), all = TRUE)

HSDPRO_4h <- merge(HSDPR_4h, O2S_hours[O2S_hours$time < 4, ], by = c("icustay_id", "subject_id", "death_icu", "time"), all = TRUE)

HSDPROT_4h <- merge(HSDPRO_4h, TEMP_hours[TEMP_hours$time < 4, ], by = c("icustay_id", "subject_id", "death_icu", "time"), all = TRUE)

#nrow(HSDPROT_4h)
#72350

The total merge dataset has 23,124 patients. However, because many patients have more than 1 missing value for temperature at the different 4 time points, this variable is excluded at this point.

Df4h <- HSDPRO_4h
patients4h <- unique(Df4h$icustay_id) 
length(patients4h)
## [1] 23107
i = 1

for (patient in patients4h){
  
  if (length(is.na(Df4h[Df4h$icustay_id == patient & Df4h$time == 0,])) == 0){
      new_time <- c(patient,
                    Df4h[Df4h$icustay_id == patient,]$subject_id[1],
                    Df4h[Df4h$icustay_id == patient,]$death_icu[1],
                    0,
                    NA, NA, NA, NA, NA, NA)
      Df4h <- rbind(Df4h, new_time)
  }  
  if (length(is.na(Df4h[Df4h$icustay_id == patient & Df4h$time == 1,])) == 0){
      new_time <- c(patient,
                    Df4h[Df4h$icustay_id == patient,]$subject_id[1],
                    Df4h[Df4h$icustay_id == patient,]$death_icu[1],
                    1,
                    NA, NA, NA, NA, NA, NA)
      Df4h <- rbind(Df4h, new_time)
  }  
  if (length(is.na(Df4h[Df4h$icustay_id == patient & Df4h$time == 2,])) == 0){
      new_time <- c(patient,
                    Df4h[Df4h$icustay_id == patient,]$subject_id[1],
                    Df4h[Df4h$icustay_id == patient,]$death_icu[1],
                    2,
                    NA, NA, NA, NA, NA, NA)
      Df4h <- rbind(Df4h, new_time)
  }  
  if (length(is.na(Df4h[Df4h$icustay_id == patient & Df4h$time == 3,])) == 0){
      new_time <- c(patient,
                    Df4h[Df4h$icustay_id == patient,]$subject_id[1],
                    Df4h[Df4h$icustay_id == patient,]$death_icu[1],
                    3,
                    NA, NA, NA, NA, NA, NA)
      Df4h <- rbind(Df4h, new_time)  
  }
}

nrow(HSDPROT_4h)
## [1] 72350
nrow(Df4h)
## [1] 92428
length(unique(HSDPROT_4h$icustay_id))
## [1] 23124
length(unique(Df4h$icustay_id))
## [1] 23107

After completing the table with all times: 92496 rows.

drop_patients <- list()

for (patient in patients4h) {

  if (length(which(is.na(Df4h[Df4h$icustay_id == patient,]$hr))) > 1 || 
    length(which(is.na(Df4h[Df4h$icustay_id == patient,]$sbp))) > 1 ||
    length(which(is.na(Df4h[Df4h$icustay_id == patient,]$dbp))) > 1 ||
    length(which(is.na(Df4h[Df4h$icustay_id == patient,]$pp))) > 1 ||
    length(which(is.na(Df4h[Df4h$icustay_id == patient,]$rr))) > 1 ||
    length(which(is.na(Df4h[Df4h$icustay_id == patient,]$o2s))) > 1)
    {
      drop_patients[i] <- patient
      i = i + 1
      #Df4h[-Df4h$icustay_id == patient,]
      #print(head(Df4h))
      #print(patient)
  }

}

length(drop_patients)
## [1] 8363
prueba <- Df4h[!Df4h$icustay_id %in% drop_patients,]
length(unique(prueba$icustay_id))
## [1] 14744
raw_4H <- prueba[order(prueba$icustay_id, prueba$time),]

Patients with data for the first 4h: 23,124. Number of patients with more than 1 missing value for a variable: 8,435. Number of patients in the final dataset: 14,744.

  • Imputation of missing values

The missing values were imputted based on the mean between data point before and after the missing value (ie. 2 and 3), or by duplicating the value after (the case for the first value), or duplicating the value before (the case for the fourth value).

Summary of missing values: - HR: 5483 rows - SBP: 5984 rows - DBP: 5988 rows - PP: 5992 rows - RR: 5422 rows - O2S: 5903 rows

summary(raw_4H)
##    icustay_id       subject_id      death_icu           time     
##  Min.   :200003   Min.   :    3   Min.   :0.0000   Min.   :0.00  
##  1st Qu.:225192   1st Qu.:13528   1st Qu.:0.0000   1st Qu.:0.75  
##  Median :249887   Median :27134   Median :0.0000   Median :1.50  
##  Mean   :249985   Mean   :37261   Mean   :0.1069   Mean   :1.50  
##  3rd Qu.:274880   3rd Qu.:61364   3rd Qu.:0.0000   3rd Qu.:2.25  
##  Max.   :299992   Max.   :99999   Max.   :1.0000   Max.   :3.00  
##                                                                  
##        hr              sbp             dbp               pp         
##  Min.   : 26.00   Min.   : 47.0   Min.   : 10.00   Min.   :-162.00  
##  1st Qu.: 73.00   1st Qu.:105.5   1st Qu.: 53.00   1st Qu.:  46.00  
##  Median : 84.00   Median :119.2   Median : 62.00   Median :  56.00  
##  Mean   : 86.28   Mean   :121.7   Mean   : 63.29   Mean   :  58.44  
##  3rd Qu.: 98.00   3rd Qu.:136.0   3rd Qu.: 72.00   3rd Qu.:  69.00  
##  Max.   :187.33   Max.   :254.2   Max.   :282.00   Max.   : 166.00  
##  NA's   :5483     NA's   :5984    NA's   :5988     NA's   :5992     
##        rr             o2s        
##  Min.   : 2.00   Min.   : 70.00  
##  1st Qu.:14.50   1st Qu.: 96.00  
##  Median :17.67   Median : 98.25  
##  Mean   :18.40   Mean   : 97.61  
##  3rd Qu.:21.00   3rd Qu.:100.00  
##  Max.   :91.00   Max.   :100.00  
##  NA's   :5422    NA's   :5903
raw_4H2 <- raw_4H

## columns:
# HR - 5
# SBP - 6
# DBP - 7
# PP - 8
# RR - 9
# O2S - 10


raw_4H3 <- NAimputation(raw_4H2, 5)
raw_4H4 <- NAimputation(raw_4H3, 6)
raw_4H5 <- NAimputation(raw_4H4, 7)
raw_4H6 <- NAimputation(raw_4H5, 8)
raw_4H7 <- NAimputation(raw_4H6, 9)
raw_4H8 <- NAimputation(raw_4H7, 10)

nrow(raw_4H8)
## [1] 58976
length(unique(raw_4H8$icustay_id))
## [1] 14744
summary(raw_4H8)
##    icustay_id       subject_id      death_icu           time     
##  Min.   :200003   Min.   :    3   Min.   :0.0000   Min.   :0.00  
##  1st Qu.:225192   1st Qu.:13528   1st Qu.:0.0000   1st Qu.:0.75  
##  Median :249887   Median :27134   Median :0.0000   Median :1.50  
##  Mean   :249985   Mean   :37261   Mean   :0.1069   Mean   :1.50  
##  3rd Qu.:274880   3rd Qu.:61364   3rd Qu.:0.0000   3rd Qu.:2.25  
##  Max.   :299992   Max.   :99999   Max.   :1.0000   Max.   :3.00  
##        hr              sbp             dbp               pp         
##  Min.   : 26.00   Min.   : 47.0   Min.   : 10.00   Min.   :-162.00  
##  1st Qu.: 73.00   1st Qu.:106.0   1st Qu.: 53.50   1st Qu.:  46.00  
##  Median : 84.00   Median :120.0   Median : 62.00   Median :  56.33  
##  Mean   : 86.26   Mean   :122.1   Mean   : 63.61   Mean   :  58.53  
##  3rd Qu.: 98.00   3rd Qu.:136.0   3rd Qu.: 72.00   3rd Qu.:  69.00  
##  Max.   :187.33   Max.   :254.2   Max.   :282.00   Max.   : 166.00  
##        rr             o2s        
##  Min.   : 2.00   Min.   : 70.00  
##  1st Qu.:14.50   1st Qu.: 96.00  
##  Median :17.50   Median : 98.40  
##  Mean   :18.34   Mean   : 97.63  
##  3rd Qu.:21.00   3rd Qu.:100.00  
##  Max.   :91.00   Max.   :100.00

The final dataset after imputation: 58,976 rows, 14,744 patients

Saving the data frame

#save(raw_4H8, 
#     file = "/Users/maria/Desktop/MIMIC-III/dynamic_data/DynamicCohort4H.Rda")


# write.table(raw_4H8, 
#  file = "/Users/maria/Desktop/MIMIC-III/dynamic_data/DynamicCohort4H.csv",
#             row.names=FALSE, na="",col.names=TRUE, sep="\t")

# write.table(SBP_hours, 
#  file = "/Users/maria/Desktop/MIMIC-III/dynamic_data/DynamicSBP_hours.csv",
#             row.names=FALSE, na="",col.names=TRUE, sep="\t")
# 
# write.table(HR_hours, 
#  file = "/Users/maria/Desktop/MIMIC-III/dynamic_data/DynamicHR_hours.csv",
#             row.names=FALSE, na="",col.names=TRUE, sep="\t")
# 
# write.table(DBP_hours,
# file = "/Users/maria/Desktop/MIMIC-III/dynamic_data/DynamicDBP_hours.csv",
#          row.names=FALSE, na="",col.names=TRUE, sep="\t")
# 
# write.table(RR_hours,
# file = "/Users/maria/Desktop/MIMIC-III/dynamic_data/DynamicRR_hours.csv",
#          row.names=FALSE, na="",col.names=TRUE, sep="\t")
# 
# write.table(O2S_hours,
# file = "/Users/maria/Desktop/MIMIC-III/dynamic_data/DynamicO2S_hours.csv",
#          row.names=FALSE, na="",col.names=TRUE, sep="\t")

Additional information

sessionInfo()
## R version 3.4.2 (2017-09-28)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.1
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] plyr_1.8.4          MASS_7.3-47         ggplot2_2.2.1      
## [4] data.table_1.10.4-3
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.14     knitr_1.18       magrittr_1.5     munsell_0.4.3   
##  [5] colorspace_1.3-2 rlang_0.2.0      stringr_1.2.0    tools_3.4.2     
##  [9] grid_3.4.2       gtable_0.2.0     htmltools_0.3.6  yaml_2.1.16     
## [13] lazyeval_0.2.1   rprojroot_1.3-2  digest_0.6.12    tibble_1.4.2    
## [17] evaluate_0.10.1  rmarkdown_1.8    labeling_0.3     stringi_1.1.6   
## [21] compiler_3.4.2   pillar_1.1.0     scales_0.5.0     backports_1.1.2